Pandas Profiling

Import Libraries

Standard Libraries

import os
import logging
# Raise the root logger's threshold so that only CRITICAL records pass it.
root_logger = logging.getLogger()
root_logger.setLevel(logging.CRITICAL)

External Libraries

import pandas as pd
import geopandas as gpd
from ydata_profiling import ProfileReport, config
# Build a Settings object with the progress bar switched off. The instance is
# not bound to a name, so it is only echoed as the cell's output.
# NOTE(review): nothing in this file hands this object to the report created
# later - confirm whether it has any effect on that report.
config.Settings(progress_bar=False)
Settings(title='Pandas Profiling Report', dataset=Dataset(description='', creator='', author='', copyright_holder='', copyright_year='', url=''), variables=Variables(descriptions={}), infer_dtypes=True, show_variable_description=True, pool_size=0, progress_bar=False, vars=Univariate(num=NumVars(quantiles=[0.05, 0.25, 0.5, 0.75, 0.95], skewness_threshold=20, low_categorical_threshold=5, chi_squared_threshold=0.999), cat=CatVars(length=True, characters=True, words=True, cardinality_threshold=50, imbalance_threshold=0.5, n_obs=5, chi_squared_threshold=0.999, coerce_str_to_date=False, redact=False, histogram_largest=50, stop_words=[]), image=ImageVars(active=False, exif=True, hash=True), bool=BoolVars(n_obs=3, imbalance_threshold=0.5, mappings={'t': True, 'f': False, 'yes': True, 'no': False, 'y': True, 'n': False, 'true': True, 'false': False}), path=PathVars(active=False), file=FileVars(active=False), url=UrlVars(active=False), timeseries=TimeseriesVars(active=False, sortby=None, autocorrelation=0.7, lags=[1, 7, 12, 24, 30], significance=0.05, pacf_acf_lag=100)), sort=None, missing_diagrams={'bar': True, 'matrix': True, 'heatmap': True}, correlation_table=True, correlations={'auto': Correlation(key='auto', calculate=True, warn_high_correlations=10, threshold=0.5, n_bins=10), 'spearman': Correlation(key='spearman', calculate=False, warn_high_correlations=10, threshold=0.5, n_bins=10), 'pearson': Correlation(key='pearson', calculate=False, warn_high_correlations=10, threshold=0.5, n_bins=10), 'phi_k': Correlation(key='phi_k', calculate=False, warn_high_correlations=10, threshold=0.5, n_bins=10), 'cramers': Correlation(key='cramers', calculate=False, warn_high_correlations=10, threshold=0.5, n_bins=10), 'kendall': Correlation(key='kendall', calculate=False, warn_high_correlations=10, threshold=0.5, n_bins=10)}, interactions=Interactions(continuous=True, targets=[]), categorical_maximum_correlation_distinct=100, memory_deep=False, 
plot=Plot(missing=MissingPlot(force_labels=True, cmap='RdBu'), image_format=<ImageType.svg: 'svg'>, correlation=CorrelationPlot(cmap='RdBu', bad='#000000'), dpi=800, histogram=Histogram(bins=50, max_bins=250, x_axis_labels=True), scatter_threshold=1000, cat_freq=CatFrequencyPlot(show=True, type='bar', max_unique=10, colors=None)), duplicates=Duplicates(head=10, key='# duplicates'), samples=Samples(head=10, tail=10, random=0), reject_variables=True, n_obs_unique=10, n_freq_table_max=10, n_extreme_obs=10, report=Report(precision=8), html=Html(style=Style(primary_colors=['#377eb8', '#e41a1c', '#4daf4a'], logo='', theme=None), navbar_show=True, minify_html=True, use_local_assets=True, inline=True, assets_prefix=None, assets_path=None, full_width=False), notebook=Notebook(iframe=Iframe(height='800px', width='100%', attribute=<IframeAttribute.srcdoc: 'srcdoc'>)))

Define Variables

Input Files

# Relative path of the .gdb directory that is read with geopandas below.
flood_mappluto_folder = "data/merge/nyc-street-flooding-mappluto.gdb/"

Get Street Flooding with MapPLUTO Dataset

# Read the dataset from the file geodatabase directory into a GeoDataFrame.
#
# `rows=5` limits the read to the first five rows, so everything downstream
# (including the profile report) is computed on that five-row sample only.
#
# The previous `file='FileGDB'` keyword was dropped: `file` is not a parameter
# of `geopandas.read_file`, so it was forwarded to the I/O engine as a stray
# option instead of selecting a driver. The format is detected from the path.
flood_mappluto_gdf = gpd.read_file(
    flood_mappluto_folder,
    rows=5,
)
# flood_mappluto_gdf.reset_index(inplace=True)
# Disabled code: the assignment below is wrapped in a bare triple-quoted string,
# so `preview_columns_merge_list` is never defined. The string is only an
# expression statement; in the notebook it is echoed as the cell's output.
"""preview_columns_merge_list = [
    'borough',
    'created_date', 
    'street_name', 
    'bbl', 
    'ZipCode', 
    # 'geometry',
    'Tract2010',
    'TaxMap'
]"""
"preview_columns_merge_list = [\n    'borough',\n    'created_date', \n    'street_name', \n    'bbl', \n    'ZipCode', \n    # 'geometry',\n    'Tract2010',\n    'TaxMap'\n]"
# Columns removed from the GeoDataFrame before it is handed to the profiler.
# NOTE(review): the reason each column counts as "unsupported" is not recorded
# in this file - confirm before adding or removing entries.
unsupported_columns = [
    # Geometry column.
    "geometry",
    # snake_case column names.
    "intersection_street_2", "road_ramp",
    "bridge_highway_direction", "bridge_highway_segment", "bridge_highway_name",
    "location_type", "due_date",
    "taxi_company_borough", "taxi_pick_up_location",
    "intersection_street_1", "vehicle_type", "landmark",
    # Capitalised column names.
    "ZoneDist2", "ZoneDist3", "ZoneDist4",
    "Overlay1", "Overlay2",
    "SPDist2", "SPDist3",
    "LtdHeight", "OwnerType", "HistDist", "Landmark_1",
    "CondoNo", "EDesigNum",
    "APPBBL", "APPDate",
    "FIRM07_FLAG", "DCPEdited", "Notes",
]
# Work on a copy so the frame read from disk keeps all of its columns, then
# strip the unsupported columns from that copy in place. The drop raises
# KeyError if any listed column is absent (pandas' default `errors='raise'`).
flood_mappluto_supported_cols_gdf = flood_mappluto_gdf.copy()
flood_mappluto_supported_cols_gdf.drop(
    labels=unsupported_columns,
    axis="columns",
    inplace=True,
)

Profile Street Flooding with MapPLUTO Dataset

# Build the profiling report for the column-filtered frame and render it inline
# as an iframe in the notebook.
#
# `progress_bar=False` is passed directly to ProfileReport: a Settings object
# that is constructed on its own and not handed to the report does not change
# the report's configuration, so the option has to be supplied here to
# actually suppress the progress bars.
profile = ProfileReport(
    flood_mappluto_supported_cols_gdf,
    progress_bar=False,
)
profile.to_notebook_iframe()